********************************************************************************
* reformat RCA data

* infutorize_addresses
* Loads rca_cleaned.dta, splits the free-text address (address_tx) into
* space-delimited tokens, sorts those tokens into house number / pre-direction /
* street / street type / post-direction fields, expands hyphenated house-number
* ranges into two rows, builds a full_address string, and saves the result as
* rca_cleaned_v3.dta.  Takes no arguments; replaces the data in memory.
* The statements are strongly order-dependent: each add# token is consumed and
* dropped in turn, so do not reorder them.
program define infutorize_addresses

{

* take cleaned RCA data
use rca_cleaned.dta ,clear

* do a lot of repetitive string operations to get addresses to match the
* infutor file as closely as possible
* split creates add1, add2, ... (one variable per space-delimited token);
* the code below refers to add1 through add8
gen add = address_tx
split add, parse(" ")

* empty containers for the parsed components (house number starts as token 1)
gen add_hhnum = add1, after(add1)
gen add_pdirect = "", after(add_hhnum)
gen add_street = "", after(add_pdirect)
gen add_st = "", after(add_street)
gen add_pstdirect = "", after(add_st)
gen add_city = "", after(add_pstdirect)
gen add_state = "", after(add_city)
gen add_zip = "", after(add_state)

* blank out first tokens that are known not to be house numbers
replace add_hhnum = "" if add_hhnum == "Executive"
replace add_hhnum = "" if add_hhnum == "Ben"
replace add_hhnum = "" if add_hhnum == "Broad"
replace add_hhnum = "" if add_hhnum == "Coleman"
replace add_hhnum = "" if add_hhnum == "Dessau"
replace add_hhnum = "" if add_hhnum == "E"
replace add_hhnum = "" if add_hhnum == "Half"
replace add_hhnum = "" if add_hhnum == "N"
replace add_hhnum = "" if add_hhnum == "NEC"
replace add_hhnum = "" if add_hhnum == "Roxboro"
replace add_hhnum = "" if add_hhnum == "SEC"
replace add_hhnum = "" if add_hhnum == "W"
replace add_hhnum = "" if add_hhnum == "Welton"


* x = position of the first character after the hyphen (1 when there is no
* hyphen, because strpos returns 0), so x > 0 holds for every row
gen x = strpos(add_hhnum, "-") + 1


* z = x - length, hence -1*z + 1 = (length - hyphen position), which is the
* number of characters that follow the hyphen
gen z = x - strlen(add_hhnum)
* lastnum = the text after the hyphen (the whole token when there is no hyphen)
gen lastnum = substr(add_hhnum, x, -1*z+1) if x > 0
* special case: a trailing hyphen with the range end missing
replace lastnum = "4222" if   add_hhnum  == "4218-"
replace   add_hhnum = "4218" if   add_hhnum  == "4218-"
replace lastnum = "" if add_hhnum == ""
* keep as many leading characters as there are characters after the hyphen.
* z is not recomputed after the 4218 fix above, so that row is emptied here
* (its z came from "4218-") and is restored three lines below.
replace add_hhnum = substr(add_hhnum, 1, -1*z + 1)
* NOTE(review): a "5-" token would already have been truncated by the line
* above, so this may never match -- confirm
replace add_hhnum = "5" if add_hhnum == "5-"
replace add_hhnum = "" if add_hhnum == "6th"
replace add_hhnum = "4218" if lastnum == "4222"
* discard range ends that contain a lowercase s or t
replace lastnum = "" if strpos(lastnum, "s") > 0
replace lastnum = "" if strpos(lastnum, "t") > 0


drop add1

// Fill in with add2 contents: a compass direction becomes the pre-direction,
// anything else becomes the start of the street name
replace add_pdirect = add2 if inlist(add2, "E", "N", "NE", "NW", "S", "SE", "SW", "W")
replace add2 = "" if inlist(add2, "E", "N", "NE", "NW", "S", "SE", "SW", "W")
replace add_street = add2 if add_street == ""
drop add2

// Fill in with add3 contents
replace add_street = add3 if add_pdirect != "" & add_hhnum != "" & add_street == ""
replace add_street = "Los Angeles" if add3 == "Los" & add4 == "Angeles" 
replace add3 = "" if add_street == "Los Angeles" 
replace add4 = "" if add_street == "Los Angeles"
replace add3 = "" if add_street == add3

// street-type abbreviations (two calls because the lists are split)
replace add_st = add3 if inlist(add3, "Ave", "Blvd", "Dr", "Pl", "Rd", "St", "Way", "Hwy")
replace add3 = "" if  inlist(add3, "Ave", "Blvd", "Dr", "Pl", "Rd", "St", "Way", "Hwy")
replace add_st = add3 if inlist(add3, "Pky", "Xing", "Ctr", "Ct", "Trl", "Cir", "Crk", "Sq", "Plz")
replace add3 = "" if inlist(add3, "Pky", "Xing", "Ctr", "Ct", "Trl", "Cir", "Crk", "Sq", "Plz")

replace add_pstdirect = add3 if inlist(add3, "E", "N", "NE", "NW", "S", "SE", "SW", "W")
replace add3 = "" if inlist(add3, "E", "N", "NE", "NW", "S", "SE", "SW", "W")

replace add_city = add3 if inlist(add3, "Brooklyn", "Denver")
replace add3 = "" if inlist(add3, "Brooklyn", "Denver")
// whatever is left of add3 is treated as part of the street name
replace add_street = add_street + " " + add3
drop add3


// Fill in with add4 contents
replace add_st = add4 if inlist(add4, "Ave", "Blvd", "Dr", "Pl", "Rd", "St", "Way", "Ln", "Hwy")
replace add4 = "" if inlist(add4, "Ave", "Blvd", "Dr", "Pl", "Rd", "St", "Way", "Ln", "Hwy")
replace add_st = add4 if inlist(add4, "Pky", "Xing", "Ctr", "Ct", "Trl", "Cir", "Crk", "Sq", "Plz")
replace add4 = "" if inlist(add4, "Pky", "Xing", "Ctr", "Ct", "Trl", "Cir", "Crk", "Sq", "Plz")
replace add_st = add4 if add4 == "St."
replace add4 = "" if add4 == "St."

replace add_pstdirect = add4 if inlist(add4, "E", "N", "NE", "NW", "S", "SE", "SW", "W")
replace add4 = "" if inlist(add4, "E", "N", "NE", "NW", "S", "SE", "SW", "W")

// city names (or the first word of a multi-word city name)
replace add_city = add4 if inlist(add4, "Atlanta", "Austin", "Brooklyn", "Denver", "Los", "Philadelphia", "Portland", "San", "Seattle")
replace add4 = "" if inlist(add4, "Atlanta", "Austin", "Brooklyn", "Denver", "Los", "Philadelphia", "Portland", "San", "Seattle")
replace add_city = add4 if inlist(add4, "Washington", "Chicago", "North", "West", "Woodland", "Marina", "Englewood", "Studio")
replace add4 = "" if inlist(add4, "Washington", "Chicago", "North", "West", "Woodland", "Marina", "Englewood", "Studio")

replace add_state = add4 if inlist(add4, "CO", "NY")
replace add4 = "" if inlist(add4, "CO", "NY")
replace add4 = "" if add_street == add4
replace add_street = add_street + " " + add4
drop add4


// Fill in with add5 contents
replace add_st = add5 if inlist(add5, "Ave", "Blvd", "Dr", "Pl", "Rd", "St", "Way", "Ln", "Hwy")
replace add5 = "" if inlist(add5, "Ave", "Blvd", "Dr", "Pl", "Rd", "St", "Way", "Ln", "Hwy")
replace add_st = add5 if inlist(add5, "Pky", "Xing", "Ctr", "Ct", "Trl", "Cir", "Crk", "Sq", "Plz")
replace add5 = "" if inlist(add5, "Pky", "Xing", "Ctr", "Ct", "Trl", "Cir", "Crk", "Sq", "Plz")

replace add_pstdirect = add5 if inlist(add5, "E", "N", "NE", "NW", "S", "SE", "SW", "W")
replace add5 = "" if inlist(add5, "E", "N", "NE", "NW", "S", "SE", "SW", "W")

replace add_state = add5 if inlist(add5, "CA", "CO", "DC", "GA", "IL")
replace add5 = "" if inlist(add5, "CA", "CO", "DC", "GA", "IL")
replace add_state = add5 if inlist(add5, "NY", "OR", "PA", "TX", "WA")
replace add5 = "" if inlist(add5, "NY", "OR", "PA", "TX", "WA")

replace add_city = add5 if inlist(add5, "Atlanta", "Austin", "Brooklyn", "Denver", "Los", "Philadelphia", "Portland", "San", "Seattle")
replace add5 = "" if inlist(add5, "Atlanta", "Austin", "Brooklyn", "Denver", "Los", "Philadelphia", "Portland", "San", "Seattle")
replace add_city = add5 if inlist(add5, "Washington", "Chicago", "North", "Woodland", "Marina", "Englewood", "West", "Playa")
replace add5 = "" if inlist(add5, "Washington", "Chicago", "North", "Woodland", "Marina", "Englewood", "West", "Playa")
// second words of multi-word city names are appended to the existing add_city
replace add_city = add_city + " " + add5 if inlist(add5, "Francisco", "Hollywood", "Angeles", "Del", "del", "City", "Hills")
replace add5 = "" if inlist(add5, "Francisco", "Hollywood", "Angeles", "Del", "del", "City", "Hills")


replace add_street = add_street + " " + add5
drop add5


// Fill in with add6 contents
// (shorter keyword lists than for the other tokens, and any leftover add6
// text is dropped rather than appended to add_street)
replace add_st = add6 if inlist(add6, "Blvd", "Shr")
replace add6 = "" if inlist(add6, "Blvd", "Shr")

replace add_state = add6 if inlist(add6, "CA", "CO", "DC", "GA", "IL")
replace add6 = "" if inlist(add6, "CA", "CO", "DC", "GA", "IL")
replace add_state = add6 if inlist(add6, "NY", "OR", "PA", "TX", "WA")
replace add6 = "" if inlist(add6, "NY", "OR", "PA", "TX", "WA")

replace add_pstdirect = add6 if inlist(add6, "E", "NE", "NW", "S")
replace add6 = "" if inlist(add6, "E", "NE", "NW", "S")

replace add_city = add6 if inlist(add6, "Atlanta", "Austin", "Brooklyn", "Denver", "Los", "Philadelphia", "Portland", "San", "Seattle")
replace add6 = "" if inlist(add6, "Atlanta", "Austin", "Brooklyn", "Denver", "Los", "Philadelphia", "Portland", "San", "Seattle")
replace add_city = add6 if inlist(add6, "Washington", "Chicago", "North", "Woodland", "Marina", "Englewood", "West")
replace add6 = "" if inlist(add6, "Washington", "Chicago", "North", "Woodland", "Marina", "Englewood", "West")
replace add_city = add_city + " " + add6 if inlist(add6, "Francisco", "Hollywood", "Angeles", "Rey", "Hills", "Pedro", "Vista", "del", "Del")
replace add6 = "" if inlist(add6, "Francisco", "Hollywood", "Angeles", "Rey", "Hills", "Pedro", "Vista", "del", "Del")

drop add6


// Fill in with add7 contents
replace add_st = add7 if inlist(add7, "Ave", "Blvd", "Dr", "Pl", "Rd", "St", "Way", "Ln", "Hwy")
replace add7 = "" if inlist(add7, "Ave", "Blvd", "Dr", "Pl", "Rd", "St", "Way", "Ln", "Hwy")
replace add_st = add7 if inlist(add7, "Pky", "Xing", "Ctr", "Ct", "Trl", "Cir", "Crk", "Sq", "Plz")
replace add7 = "" if inlist(add7, "Pky", "Xing", "Ctr", "Ct", "Trl", "Cir", "Crk", "Sq", "Plz")

replace add_pstdirect = add7 if inlist(add7, "E", "N", "NE", "NW", "S", "SE", "SW", "W")
replace add7 = "" if inlist(add7, "E", "N", "NE", "NW", "S", "SE", "SW", "W")

replace add_state = add7 if inlist(add7, "CA", "CO", "DC", "GA", "IL")
replace add7 = "" if inlist(add7, "CA", "CO", "DC", "GA", "IL")
replace add_state = add7 if inlist(add7, "NY", "OR", "PA", "TX", "WA")
replace add7 = "" if inlist(add7, "NY", "OR", "PA", "TX", "WA")

replace add_city = add7 if inlist(add7, "Atlanta", "Austin", "Brooklyn", "Denver", "Los", "Philadelphia", "Portland", "San", "Seattle")
replace add7 = "" if inlist(add7, "Atlanta", "Austin", "Brooklyn", "Denver", "Los", "Philadelphia", "Portland", "San", "Seattle")
replace add_city = add7 if inlist(add7, "Washington", "Chicago", "North", "Woodland", "Marina", "Englewood", "West", "Playa")
replace add7 = "" if inlist(add7, "Washington", "Chicago", "North", "Woodland", "Marina", "Englewood", "West", "Playa")
replace add_city = add_city + " " + add7 if inlist(add7, "Francisco", "Hollywood", "Angeles", "Del", "del", "City", "Hills")
replace add7 = "" if inlist(add7, "Francisco", "Hollywood", "Angeles", "Del", "del", "City", "Hills")


replace add_street = add_street + " " + add7

drop add7


// Fill in with add8 contents: a state code if none was found yet, otherwise
// the token is taken as the zip
replace add_state = add8 if inlist(add8, "CA", "GA", "TX") & add_state == ""
replace add8 = "" if add_state == add8

replace add_zip = add8 if add_zip == ""
replace add8 = "" if add_zip == add8
drop add8

* the parsed city/state/zip are discarded and replaced by the dedicated
* columns already present in the data
drop add_city add_state add_zip x z 
gen add_state = stateprov_cd
gen add_zip = postalcode_tx
gen add_city = city

* convert house number(s) to numeric; force turns non-numeric text into missing
destring(*hhnum), replace force


* lastnum is only meaningful when the original address contains a hyphen
replace lastnum = "" if strpos(address_tx, "-") == 0
destring lastnum, replace
gen n = abs((lastnum-add_hhnum)+1)


* reorder variables and concatenate
* every row with a usable range gets exactly one extra copy (n is forced to 2,
* whatever the width of the range)
replace n = 2 if n != .
expand n

order(add_hhnum lastnum add_zip add_state), after(add_city)

* upper-case and collapse whitespace in every variable from add to add_city
foreach var of varlist add - add_city {
	replace `var' = upper(strtrim(stritrim(`var')))
}
destring add_zip, replace

* hhnum_new = the house number for non-range rows; within each id the second
* row (sorted by hhnum_new) of an expanded range receives the range end
bysort id: gen hhnum_new = add_hhnum if lastnum == .
bysort id (hhnum_new): replace hhnum_new = lastnum if n == 2 & _n == 2

gen ahhnum = string(hhnum_new)

* addresses without a hyphen are used as-is
gen full_address = add if strpos(add, "-") == 0
* x = number of digits in the house number, y = position of the hyphen
gen x = strlen(string(add_hhnum))
gen y = strpos(address_tx, "-")

* reformat some unmatched observations
* NOTE(review): no variable named exactly hhnum is created in this program;
* unless rca_cleaned.dta supplies one, Stata's variable abbreviation would
* resolve hhnum to hhnum_new here -- confirm which is intended
replace hhnum = hhnum_new
replace hhnum = 0 if hhnum == . 

* for hyphenated addresses, rebuild the address as this row's house number
* plus the text that follows the range (hyphen position + digits + 1)
replace full_address = ahhnum + substr(add, y + 1 + x, .) if full_address == "" & ahhnum != "."

* remaining rows (no hhnum_new): fall back to the range start
gen f = 1 if full_address == "" & ahhnum == "."
replace ahhnum = string(add_hhnum) if f == 1
replace full_address = string(add_hhnum) + substr(add, y + 1 + x, .) if f == 1

drop f

*finally, manually fix where the address is denoted in a different way

replace ahhnum = "3817" if full_address == "38 N BROADWAY"
replace ahhnum = "3845" if full_address == "45 N BROADWAY"
replace full_address = "3817 N BROADWAY" if full_address == "38 N BROADWAY"
replace full_address = "3845 N BROADWAY" if full_address == "45 N BROADWAY"

* string() renders numeric missing as "."
replace ahhnum = "" if ahhnum == "."
 
* NOTE(review): & binds tighter than |, so the add_st == "TX" test applies only
* to the first comparison; full is presumably an abbreviation of full_address.
* Confirm that both are intended.
replace full_address = add if add_st == "TX" & full == "11024CH RD" | full == "8715" | full == "8000"
replace ahhnum = "109C" if full == "109C DENSON DR"

rename ahhnum hhnum

* some final renaming
rename add_zip zip

* a street "name" that is really a street type is moved to add_st
replace add_st = "ST" if add_street == "ST"
replace add_st = "RD" if add_street == "RD"
replace add_st = "DR" if add_street == "DR"

gen street = add_street

* street keeps only short names: blank it for bare RD/ST, for names with AND or
* an ampersand past the first character, and for names longer than 5 characters
replace street = "" if add_street == "RD" | add_street == "ST"
replace street = "" if strpos(add_street, "AND") > 1
replace street = "" if strpos(add_street, "&") > 1
replace street = "" if strlen(add_street) > 5

sort street

* NOTE(review): the observation ranges below are hard-coded positions in the
* sort order above, so they are only valid for the exact input data (and tie
* order) they were written against -- re-check whenever rca_cleaned.dta changes
replace street = "" in 1688/1790
replace street = "" in 1800/1820
replace street = "" in 1827/1837
replace street = "" in 1844/1855
replace street = "" in 1858/1881
replace street = "" in 1891/1935
replace street = "" in 1940/2009
replace street = "" in 2012/2021
replace street = "" in 2023/2054

rename street street_new

* the final output is the RCA data with addresses in infutor format
save rca_cleaned_v3.dta, replace

}

end


********************************************************************************
* match new building addresses in the master list of infutor addresses

* find_matches
* Matches the RCA building addresses (rca_cleaned_v3.dta) to the infutor
* address list (geocoded_address_list) and writes the matched pairs to
* merged_infutor-zillow_lesstight.dta.  Takes no arguments; replaces the data
* in memory and writes several intermediate .dta files along the way.
* Requires the user-written commands matchit and geodist.
program define find_matches

{

set more off

// Subset the infutor address list file to zips contained in our data

* get the zips with a new building
use rca_cleaned_v3.dta, clear
duplicates drop full_address, force
keep zip
duplicates drop *, force
destring(zip), replace
save zip_list, replace

* load address file and subset
use geocoded_address_list,clear
keep if inlist(state, "CA", "CO", "DC", "GA", "IL") | inlist(state, "NY", "OR", "PA", "TX", "WA")
keep if inlist(dataset_merged, 2, 3)

merge m:1 zip using zip_list
keep if _merge == 3
drop _merge
tostring zip, replace

* street2 keeps the original street name; street is rebuilt to hold only short
* names (under 11 characters, then cut again to at most 5 below)
gen street2 = street
replace street = ""
replace street = street2 if strlen(street2) < 11

* names of the form "AVENUE " plus one character are reduced to that character
gen letters = substr(street2, -1, 1)
gen z = strlen(street2)
gen test = substr(street2, 1, z-1)

replace street = letters if test == "AVENUE "
replace street = "" if strlen(street) > 5 
gen firstchar = substr(street, 1, 1)
gen street_short = street
* multi-character names are kept in street_short only when they start with a digit
replace street_short = "" if strlen(street) > 1 ///
	& firstchar != "1" & firstchar != "2" & firstchar != "3" ///
	& firstchar != "4" & firstchar != "5" & firstchar != "6" ///
	& firstchar != "7" & firstchar != "8" & firstchar != "9" ///
	& firstchar != "0"

gen street_infutor = street 
save merged_infutor_addresses_subset_relzips.dta, replace

// Join in RCA buildings coarsely based on hh_num and zip code
// then create long address variables that we use for string matching
destring(zip), replace
joinby hhnum zip using rca_cleaned_v3.dta
gen street3 = add_street

tostring zip, replace
* NOTE(review): the *_infutor strings are built from full_address / add_city /
* add_state (created in infutorize_addresses), while the *_new strings are
* built from pdirect / street2 / st / pstdirect / city / state -- confirm that
* the suffixes are not swapped relative to the data sources
gen full_address_infutor = full_address + " "+add_city+" "+add_state+" "+zip, after(id)
gen short_address_infutor = full_address, after(full_address_infutor)
replace short_address_infutor = strtrim(stritrim(short_address_infutor))
replace full_address_infutor = strtrim(stritrim(full_address_infutor))

gen full_address_new = pdirect+" "+hhnum+ " " +street2+" "+st+" "+pstdirect + " " +city+" "+state+" "+zip, after(id)
gen short_address_new = pdirect+" "+hhnum+ " " +street2+" "+st+" "+pstdirect, after(full_address_new)
replace short_address_new = strtrim(stritrim(short_address_new))
replace full_address_new = strtrim(stritrim(full_address_new))
save merged_infutor-zillow.dta, replace

// Use a series of matchit functions to determine best matches

* take high similarity scores or exact matches.  Also measure distance
* (use only accepts clear / nolabel; the earlier ", replace" is not a valid
* option of use)
use merged_infutor-zillow.dta, clear
* keep only pairs where both short street names are non-empty
drop if street_new == "" & street_infutor == ""
drop if street_infutor == "" & street_new != ""
drop if street_infutor != "" & street_new == ""

matchit short_address_infutor short_address_new, gen(simil)
geodist lat lon latitude longitude, gen(distance)
save merged_infutor-zillow_g2.dta, replace

* group 2.1: exact string matches
keep if simil == 1
save merged_infutor-zillow_g2_1.dta, replace

* group 2.2: similarity of at least .5 and identical short street names
use merged_infutor-zillow_g2.dta, clear
drop if simil == 1
drop if simil < .5
matchit street_new street_infutor, gen(street_score)
drop if street_score != 1
drop street_score
save merged_infutor-zillow_g2_2.dta, replace

* do this again merging in manually adjusted bad matches from the first step 
use merged_infutor-zillow.dta, clear
merge m:1 full_address_infutor full_address_new using  SR_manual_bads.dta, gen(m2)
drop m2

keep if Junk != "J"
* parentheses make the existing precedence explicit (& binds tighter than |)
keep if (street_new == "" & street_infutor == "") | Junk == "G" 
matchit short_address_infutor short_address_new , gen(simil)
bysort full_address_new: egen max_score = max(simil)
gen diff = simil - max_score
geodist lat lon latitude longitude, gen(distance)
keep if abs(diff) < 0.0001 | (distance<.1 & Junk != "G")
* NOTE(review): similscore is not created in this program (matchit is called
* with gen(simil)).  Unless SR_manual_bads.dta supplies it, both captured
* replaces fail silently and this loop does nothing -- confirm
foreach var of varlist full_address_new - longitude {
	capture replace `var' = "" if similscore < 0.25 & distance>.1
	capture replace `var' = . if  similscore < 0.25 & distance>.1
}

* keep a subset of either high similarity scores
drop if simil < .25 & dist > 1 & diff != 0 & Junk != "G"
drop if simil < .5  & diff != 0 & Junk != "G"
drop if distance > 1  & diff != 0 & Junk != "G"
drop if simil <.75  & diff != 0 & Junk != "G"
drop if diff == 0 & simil < .5 & distance > 1.5 & Junk != "G"
drop if diff == 0 & simil < .51 & distance > 1.25 & Junk != "G"

save merged_infutor-zillow_g1.dta, replace


// Combine the two runs and then keep the max from the two similarity scores OR very close distances
use merged_infutor-zillow_g1.dta, clear
append using merged_infutor-zillow_g2_1.dta
append using merged_infutor-zillow_g2_2.dta

drop max_score diff
bysort full_address_new: egen max_score = max(simil)
gen diff = simil - max_score
keep if abs(diff) < 0.0001 | distance<.1
drop max_score diff

// if highest similarity score is less than 0.25, mark it as a nonmatch
* NOTE(review): same similscore caveat as above -- the captured replaces are
* silent no-ops if that variable does not exist
foreach var of varlist full_address_infutor - longitude {
	capture replace `var' = "" if similscore < 0.25 & distance>.1
	capture replace `var' = . if  similscore < 0.25 & distance>.1
}

* clean up variables
drop rcametros_tx propertyname_tx address_tx city_tx county_tx stateprov_cd postalcode_tx yearblt_nb units sqft land_area features_tx final_source base_id met building_date month add add_pdirect add_street add_st add_pstdirect add_city add_hhnum lastnum add_state n hhnum_new full_address x y   

* do some manual checking of remaining pairs
* (the hand-picked id / address pairs below were chosen by inspecting the
* duplicates; statements that were repeated verbatim have been removed, since
* a second identical drop removes nothing)
duplicates tag full_address_infutor, gen(dup_tag)
drop if id == 4627 & short_address_new != "3052 BRIGHTON 1ST ST"
drop if id == 2930 & short_address_new != "1111 ORREN ST NE"
drop if id == 4237 & short_address_new != "1605 BELLEVUE AVE"
drop if id != 1665 & short_address_new == "900 BATTERY AVE SE"
replace dup_tag = 0 if dup_tag == 6
drop if id == 4366  & city != "DENVER"
replace dup_tag = 0 if dup_tag == 3
bysort full_address_infutor : egen max_score = max(simil)
drop if id == 1783 & short_address_new != "N 1201 LA SALLE DR"
drop if id == 3339 & short_address_new != "1317 NEW YORK AVE"
drop if id == 350 & short_address_new != "1351 DEKALB AVE"
drop if id == 3405 & short_address_new != "1430 SOUTH ST"
drop if id == 440 & dup_tag == 2
drop if id == 2805 & short_address_new != "S 2301 LAMAR BLVD"
drop if id == 2948 & short_address_new != "3233 ELIOT ST"
drop if id == 951 & dup_tag == 2
drop if id == 336 & dup_tag == 2
drop if id == 170 & short_address_new != "7043 JORDAN AVE"
drop if id == 1654 & dup_tag == 2
drop if id == 4117   & short_address_new != "96 STEUBEN ST"
replace dup_tag = 0 if dup_tag == 2
drop if id == 3301   & short_address_new == "10 HANOVER PL"
drop if id == 1868   & short_address_new == "E 1001 DAKOTA AVE"
* recompute the per-address maximum now that rows have been removed
bysort full_address_infutor: egen max_score2 = max(simil)
replace max_score=max_score2
drop max_score2

gen diff = simil - max_score
keep if abs(diff) < 0.0001 | distance<.1
drop dup_tag
duplicates tag full_address_infutor, gen(dup_tag)

* after checking, drop duplicates
duplicates drop full_address_infutor, force

* swap the truncated street names for the full ones saved earlier
drop street_infutor
drop street_new
rename street2 street_infutor
rename street3 street_new

* the final output file is a merged set of infutor addresses and rca buildings
save  merged_infutor-zillow_lesstight.dta, replace

}

end


********************************************************************************
* put together subset of infutor addresses for our MSAs, update incorrect lat/lons

* subset_addresses
* Builds migration_address_subset.dta: the infutor address list restricted to
* the ten sample states, with tract characteristics merged on and with values
* overwritten from the RCA/infutor match file where an address matches.
* Takes no arguments; replaces the data in memory.
program define subset_addresses

{

// First load the infutor address list and drop obs in other states
use geocoded_address_list,clear
rename street street_infutor
keep if inlist(state, "CA", "CO", "DC", "GA", "IL") | inlist(state, "NY", "OR", "PA", "TX", "WA")
keep if inlist(dataset_merged, 2, 3)
rename (_Y _X) (lat lon)
keep if lat != . & lon != .

// Add in tract characteristics from acs
keep state - lon GEOID
drop if GEOID == ""
* (a stray closing double quote after the file name used to leave the quotes
* unbalanced on this line)
merge m:1 GEOID using all_tracts_2017.dta
* keep every address, whether or not its tract was found
keep if inlist(_merge, 1, 3)
drop   _merge

// Change discrepancies between infutor and rca lat/lon for new buildings to RCA, which we checked manually
tostring zip, replace
* update replace: for matched addresses, values from the using file overwrite
* the values in memory for variables the two files share
merge 1:1 state hhnum pdirect street_infutor st pstdirect city zip using merged_infutor-zillow_lesstight.dta, update replace
drop _merge

// final output is list of addresses in relevant states
destring(zip), replace
rename street_infutor street
save migration_address_subset.dta, replace

}

end


********************************************************************************
* pull moves close to new buildings

* pull_moves
* Finds everyone in the infutor files who has ever lived within 1 km of a new
* building.  The states are processed in three groups to reduce RAM
* requirements; each group is handled by the helper pull_moves_round (defined
* below) and saved to its own file, and the three files are then appended.
* Takes no arguments; replaces the data in memory.
* Requires the user-written command geonear.
program define pull_moves

{

****************** ROUND 1 *************************
* half of the states in our sample
pull_moves_round, first(DC) others(CO CA GA IL) saving(nearby_moves_p1.dta)

****************** ROUND 2 *************************
* the other half of the sample states
pull_moves_round, first(NY) others(OR PA TX WA) saving(nearby_moves_p2.dta)

****************** ROUND 3 *************************
* finally, look for people who are not currently in one of the sample states
* but previously did live near a building in the sample
pull_moves_round, first(AK) ///
	others(AL AR AZ CT DE FL HI IA ID IN KS KY LA MA MD ME MI MN MO MS MT NC ND NE NH NJ NM NV OH OK RI SC SD TN UT VA VT WI WV) ///
	saving(nearby_moves_p3.dta) rd3

****************************************************
* append the data sets
use nearby_moves_p1.dta, clear
append using nearby_moves_p2.dta
append using nearby_moves_p3.dta

* final output is one data file with the address history of everyone who's lived
* near a new building in the sample
* (a stray double quote after the file name used to break this command)
save nearby_moves.dta, replace

}

end


* pull_moves_round (private helper of pull_moves)
* Loads the infutor files for one group of states, attaches coordinates, tract
* characteristics and the nearest new building to each of the ten address
* slots (suffixes _0 to _9), keeps people with at least one address within
* 1 km of a new building, and saves the result.
*   first(string)   state code of the file that is loaded first
*   others(string)  space-separated state codes appended to it
*   saving(string)  name of the output .dta file (overwritten)
*   rd3             optional flag: also create rd3 = 1 on the output
program define pull_moves_round

	syntax , FIRST(string) OTHERS(string) SAVING(string) [RD3]

	use infutor_100pct_`first'_ALLgeoids.dta, clear
	keep pid sex_crd dob hhnum_* pdirect_* street_* st_* pstdirect_* city_* ///
		state_* zip_* *_date_* GEOID_*

	foreach state in `others' {
		di "Loading: `state'"

		append using infutor_100pct_`state'_ALLgeoids.dta, nolabel nonotes ///
			keep(pid sex_crd dob hhnum_* pdirect_* street_* st_* pstdirect_* city_* ///
			state_* zip_* *_date_* GEOID_*)
	}

	// Keep necessary data
	drop if state_0 == "" & zip_0 == .
	drop *_narc

	// Merge coordinates and tract char to each address, get distance from new buildings
	* must iterate over 10 addresses included in infutor
	forvalues i = 0/9 {

		di "1 Addresss`i'"

		// Rename for merging with coords
		rename (GEOID_`i' hhnum_`i' pdirect_`i' street_`i' st_`i' pstdirect_`i' city_`i' state_`i' zip_`i') ///
			(GEOID hhnum pdirect street st pstdirect city state zip)

		// Merge on lats and lons and characteristics
		* best-effort rename in case coordinates arrive as latitude/longitude;
		* nothing later in the iteration creates those names, so doing this once
		* is equivalent to the repeated attempts in the earlier version
		capture rename latitude lat
		capture rename longitude lon
		merge m:1 state hhnum pdirect street st pstdirect city zip using migration_address_subset.dta, ///
			keep(1 3) keepus(lat lon) nogen

		merge m:1 GEOID using all_tracts_2017.dta, keep(1 3) ///
			keepus(med_hh_inc pct_college pct_white pct_students) nogen

		di "2 Addresss`i'"

		// use geonear to get distance from new buildings and merge on apt year build
		geonear pid lat lon using   rca_cleaned.dta, n(base_id lat lon) near(1) genstub(base_id) wide
		tostring base_id, replace
		* only buildings within 1 km count as nearby
		replace base_id = "" if km_to_base_id > 1
		replace km_to_base_id = . if km_to_base_id > 1
		rename km_to_base_id km_to_apt

		di "3 Addresss`i'"

		// Rename to normal vars
		capture order lat lon med_hh_inc pct_college pct_white base_id km_to_apt, after(zip)

		rename (GEOID hhnum pdirect street st pstdirect city state zip lat lon base_id med_hh_inc pct_college pct_white pct_students) ///
			(GEOID_`i' hhnum_`i' pdirect_`i' street_`i' st_`i' pstdirect_`i' city_`i' state_`i' zip_`i' lat_`i' lon_`i' base_id_`i' med_hh_inc_`i' pct_college_`i' pct_white_`i' pct_students_`i')

		rename km_to_apt km_to_apt_`i'
		di "4 Addresss`i'"
	}

	// Create indicator for observations that are never within a km of new apartment
	gen nonew = (km_to_apt_0 == . & km_to_apt_1 == . & km_to_apt_2 == . & km_to_apt_3 == . & km_to_apt_4 == . & ///
		km_to_apt_5 == . & km_to_apt_6 == . & km_to_apt_7 == . & km_to_apt_8 == . & km_to_apt_9 == .)

	* drop these people
	drop if nonew == 1

	* marker carried only by the third group of states
	if "`rd3'" != "" {
		gen rd3=1
	}

	// Save addresses with new buildings merged on
	compress
	save `saving', replace

end


********************************************************************************
* reshape moves and construct analysis file with distance to closest building

program define long_file

{


* first, we load up the wide file of anyone who has moved to/from an apartment
* near one of our luxury apartments
* the base_id_x variable tells which complex they lived near at address x, 
* and the km_to_x gives their distance

use  nearby_moves.dta, clear
keep base_id* pct_student* pct_white_* pct_college_* med_hh_inc_* dob base_id* km* combo_date* state* hhnum* pdirect* street* st* pstdirect* city* zip* lat* lon*

* we manually reshape to a long file where an observation is a move
gen double person_id = _n	
egen nonmiss_ids = rownonmiss(hhnum_*),strok
expand nonmiss_ids
bysort person_id: gen id_num = _n
replace id_num = id_num-1

local varlist  base_id_ state_ hhnum_ pdirect_ street_ st_ pstdirect_ city_  
foreach var in `varlist'{

	gen `var'=""
	
	forvalues i = 0/9{
	
		qui replace `var' = `var'`i' if id_num == `i'
		
	}
		
}	
	
local varlist 	pct_white_ pct_college_ pct_students_ med_hh_inc_ combo_date_ km_to_apt_ zip_ lat_ lon_
foreach var in `varlist'{

gen `var'=.

forvalues i = 0/9{

	qui replace `var' = `var'`i' if id_num == `i'
	
}
	
}	

* drop excess variables
drop *_0 *_1 *_2 *_3 *_4 *_5 *_6 *_7 *_8 *_9 
drop if med_hh_inc_ ==.

bysort person_id (combo_date_): gen round=_n
save  raw_migration_long.dta, replace

* next, we take the raw long file and lag/lead addresses and try to identify who is in the new apartments

use  raw_migration_long.dta, clear

* lag/lead addresses
tsset person_id round
bys person_id: egen max_round=max(round)
drop if max_round==0
gen year=year(combo_date)

* data sometimes contains "fake" moves within the same building, often from the 
* address with a unit number to the same address with no unit number.  drop these
gen fake_move=0
replace fake_move=1 if km_to_apt==L.km_to_apt & zip==L.zip
drop if fake_move==1
drop round fake_move
bysort person_id (combo_date_): gen round=_n
tsset person_id round

* create origin and destination income variables for each move
gen origin_inc=L.med_hh_inc_
gen dest_inc=F.med_hh_inc_

gen origin_college=L.pct_college_
gen dest_college=F.pct_college_

gen origin_white=L.pct_white_
gen dest_white=F.pct_white_

gen origin_student=L.pct_students_
gen dest_student=F.pct_students_

encode state_, gen(state_num)

gen origin_state=L.state_num
gen dest_state=F.state_num
gen current_state=state_num

* merge in some information on the new buildings
rename base_id id
destring(id), replace
merge m:1 id using  apartment_distances.dta

drop if _merge==2
drop _merge
drop if id== .

merge m:1 id using  rca_characteristics.dta , 
keep if _merge==3
drop _merge

rename building_tract GEOID
merge m:1 GEOID using all_tracts_2017,keepus(pct_students)
keep if _merge==3
drop _merge
rename pct_students building_pct_students
rename GEOID building_tract

save  less_raw_migration_long.dta, replace


* next, we identify the number of people who are in the new complexes
* this is crucial to figure out the net change of in-migrants to the surrounding
* neighborhoods


* first, just merge on our rca-infutor match from above and see if we get perfect matches
use  less_raw_migration_long.dta, clear
joinby id using  merged_infutor-zillow_lesstight.dta ///
	, unmatched(master)
rename _merge joinby_merge
drop full_address_new short_address_new full_address_infutor short_address_infutor street_new dist Junk rcametros_tx propertyname_tx ///
	yearblt_nb units sqft land_area features_tx month simil distance lat lon
destring(zip), replace
rename street_infutor street

* next, check the pairs from the matched observation.  Say that perfect matches
* or near perfect matches with short distances are matches
gen in_any=0
replace in_any=1 if hhnum==hhnum_ & street==street_ & zip==zip_
replace in_any=1 if km_to_apt<.005 & hhnum==hhnum_ & street==street_

* next, bring in the more raw rca data and see if we get any matches on 
* both distance and address number.  check manually
merge m:1 id using deduped_infutorized_rca, keepus(add_street add_hhnum)
drop if _merge==2
drop _merge

tostring add_hhnum, replace
replace in_any=1 if km_to<.15 & hhnum_==add_hhnum ///
	& street_==add_street & hhnum_ != ""

* cut out expanded obs from the joinby
bys person_id round: egen currently_in_building=sum(in_any)
tab currently_in_building in_any

egen person_round_tag=tag(person round)
keep if person_round_tag==1

* check rate that we find people in buildings and remove people at the building
* addresses before the building's completion
tab currently_in_building if km_to_apt<.6 & year==2016
tab currently_in_building if km_to_apt<.4 & year==2016
tab currently_in_building if km_to_apt<.25 & year==2016
replace currently_in_building=1 if currently_in_building>1	
replace currently_in_building=0 if year<building_built

* rename things and save
drop id
rename (hhnum pdirect street st pstdirect city state zip lon lat) ///
	(hhnum_building pdirect_building street_building st_building pstdirect_building city_building state_building zip_building longitude_building latitude_building)
rename (state_ hhnum_ pdirect_ street_ st_ pstdirect_ city_) (state hhnum pdirect street st pstdirect city)
rename (pct_white_ pct_college_ med_hh_inc_ pct_students_) (current_white current_college current_inc current_student)
rename km_to_apt_ km_to_building
rename combo_date_ move_date
rename (zip_ ) (zip )
rename year move_year
rename b_nearby* buildings_within*
drop person_round_tag 
drop nonmiss_ids

order person_id round building_id km_to_building currently_in_building move_date move_year dob state hhnum pdirect street st pstdirect city zip lat lon ///
	current* origin* dest* *_building buildings_within* building_units 
	
* the final output is a long file with a move as an observation, building chars, tract chars, and lag/lead tract chars	
save  mig_analysis_file.dta,replace

}

end

*********************************************************************************
* convert long file to nearest 2014-2015 building, used for near-far dds

program define near_far_file

{

* Purpose: link every move in mig_analysis_file to its closest RCA building
* completed in 2014 or 2015, attach building and tract characteristics, and
* build the distance-ring, event-time, and sample/weight variables.
* Reads : rca_cleaned, mig_analysis_file, apartment_distances.dta,
*         rca_characteristics.dta, all_tracts_2017
* Writes: rca_2014_2015, raw_migration_long_20145,
*         mig_analysis_file_20145_all.dta
* The geonear command must be installed.

* --- candidate buildings: RCA properties built in 2014 or 2015
use rca_cleaned,clear
keep if inlist(yearblt_nb, 2014, 2015)
rename (latitude longitude) (lat lon)
save rca_2014_2015,replace

* --- nearest candidate building for each move
use mig_analysis_file,clear
keep building_id person_id round hhnum pdirect street st pstdirect city zip ///
	lat lon current_* origin* dest* move_year currently_in_building
gen double row_id=_n
geonear row_id lat lon using   rca_2014_2015.dta, ///
	n(base_id lat lon) near(1) genstub(base_id) wide

* blank out the link when the nearest building is more than 1 away
* (a missing distance also satisfies "> 1" in Stata)
tostring base_id, replace
replace base_id = "" if km_to_base_id > 1
replace km_to_base_id = . if km_to_base_id > 1
rename (km_to_base_id building_id) (km_to_apt original_building_id)

save raw_migration_long_20145,replace

* --- building and tract characteristics
use raw_migration_long_20145, clear

rename base_id id
destring id, replace

* keep every move, matched or not, then discard moves with no linked building
merge m:1 id using  apartment_distances.dta, keep(master match) nogenerate
drop if id== .

* only moves whose building has RCA characteristics survive
merge m:1 id using  rca_characteristics.dta, keep(match) nogenerate

* the tract file is keyed on GEOID, so borrow that name for the merge
rename building_tract GEOID
merge m:1 GEOID using all_tracts_2017, keepusing(pct_students) keep(match) nogenerate
rename (pct_students GEOID) (building_pct_students building_tract)

* --- naming
drop id
rename (hhnum pdirect street st pstdirect ///
	city zip lon lat) ///
	(hhnum_building pdirect_building street_building st_building pstdirect_building ///
	city_building zip_building longitude_building latitude_building)
rename km_to_apt km_to_building
rename b_nearby* buildings_within*

order person_id round building_id km_to_building currently_in_building move_year hhnum pdirect street pstdirect city zip lat lon ///
	current* origin* dest* *_building buildings_within* building_units

* an in-building flag only stands if it refers to the building linked here
* (people can live in earlier buildings that sit near 2014-2015 buildings)
replace currently_in_building=0 if currently_in_building==1 & building_id != original_building_id

* --- regression variables
rename km_to_building dist
foreach v in inc college white {
	gen log_`v'=ln(origin_`v')
}
drop if origin_inc==.

* --- people inside the buildings
replace dist=0 if currently_in_building==1
bysort building_id: egen any_match=total(currently_in_building)
replace currently_in_building=0 if move_year<building_built
* buildings with no flagged resident: treat moves within .02 as in-building
replace currently_in_building=1 if dist<.02 & any_match==0

* --- origin income buckets (one income cutoff per msa)
local msa_names atlanta austin chicago denver la brooklyn philadelphia portland sf seattle dc
local msa_cuts  66000 74000 69000 77000 70000 76000 69000 72000 96000 83000 83000
local n_msa : word count `msa_names'

gen median_income=0
forvalues k=1/`n_msa' {
	local city : word `k' of `msa_names'
	local cut  : word `k' of `msa_cuts'
	replace median_income = `cut' if msa=="`city'"
}

gen vlow_inc=(origin_inc<median_income*.66 & building_pct_student<.2)
gen high_inc=(origin_inc>median_income)

keep if dist<=.6

* --- treatment rings and event-time dummies
gen l250=(dist<.25)
gen l400=(dist<.4)

gen gap=move_year-building_built

forvalues k=1/7 {

	gen tx_`k'=(gap==`k' & l250==1)
	gen tx_n`k'=(gap==-`k' & l250==1)

	gen tx4_`k'=(gap==`k' & l400==1)
	gen tx4_n`k'=(gap==-`k' & l400==1)

}

gen tx_0=(gap==0 & l250==1)
gen tx4_0=(gap==0 & l400==1)

* censor end points: gaps beyond +/-3 are folded into the +/-3 dummies
replace tx_n3=1 if gap<-3 & l250==1
replace tx_3=1 if gap>3 & l250==1

replace tx4_n3=1 if gap<-3 & l400==1
replace tx4_3=1 if gap>3 & l400==1

* --- group identifiers for fixed effects / trends
egen building_id_year=group(building_id move_year)
egen ring_fe=group(building_id l250)
egen ring4_fe=group(building_id l400)
gen zero=0

* --- building flagged when its tract income is below the msa cutoff
gen building_income_g=0
forvalues k=1/`n_msa' {
	local city : word `k' of `msa_names'
	local cut  : word `k' of `msa_cuts'
	replace building_income_g = 1 if msa=="`city'" & building_med_hh_inc<`cut'
}

* --- sample indicator and weights
gen building_sample=(inlist(building_built, 2014, 2015) ///
	& building_units>=50 & building_income_g==1 ///
	& move_year>2010 & buildings_within_250_built==0 ///
	& building_pct_student<.2 & currently_in_building==0)

* weight = 1 / (number of sample moves in the building-year cell)
bysort building_id move_year: egen building_sample_weight_d=total(building_sample)
gen building_sample_weight=1/building_sample_weight_d

* --- per-building yearly counts of sample moves by ring, 2011-2017
local ring250 l250==1
local ring256 inrange(dist,.25,.6)>0
local ring400 l400==1

foreach r in 250 256 400 {

	forvalues yr=2011/2017 {
		bysort building_id: egen tx`r'_number_in_`yr'=total(move_year==`yr' & building_sample==1 & `ring`r'')
	}

	* a minimum across years is built for the 250 and 256 rings only
	if `r' != 400 {
		egen tx`r'_min_number=rowmin(tx`r'_number_in_*)
	}

}

* --- treatment variables
gen treated=l250
gen treated_after=treated*(move_year>building_built)

* this file is the basis for the near-far analyses
save  mig_analysis_file_20145_all.dta,replace

}



end

********************************************************************************
* convert long file to nearest 2014-2015,2018 building, used for near-near dds

program define near_near_file

{

* Purpose: link every move in mig_analysis_file to its closest RCA building
* completed in 2014, 2015, or 2018, attach building and tract characteristics,
* build event-time dummies, and save one analysis file per radius (250, 400).
* Reads : rca_cleaned, mig_analysis_file, apartment_distances.dta,
*         rca_characteristics.dta, all_tracts_2017
* Writes: rca_2014_2015_2018, raw_migration_long_201458,
*         mig_analysis_file_201458_250.dta, mig_analysis_file_201458_400.dta
* The geonear command must be installed.

* --- candidate buildings
use rca_cleaned,clear
keep if inlist(yearblt_nb, 2014, 2015, 2018)
rename (latitude longitude) (lat lon)
save rca_2014_2015_2018,replace

* --- nearest candidate building for each move
use mig_analysis_file,clear
keep building_id person_id round hhnum pdirect street st pstdirect city zip ///
	lat lon current_* origin* dest* move_year currently_in_building add_hhnum add_street
gen double row_id=_n
geonear row_id lat lon using   rca_2014_2015_2018.dta, ///
	n(base_id lat lon) near(1) genstub(base_id) wide

* blank out the link when the nearest building is more than 1 away
* (a missing distance also satisfies "> 1" in Stata)
tostring base_id, replace
replace base_id = "" if km_to_base_id > 1
replace km_to_base_id = . if km_to_base_id > 1
rename (km_to_base_id building_id) (km_to_apt original_building_id)

save raw_migration_long_201458,replace

use raw_migration_long_201458, clear

* --- building and tract characteristics
rename base_id id
destring id, replace
drop if id== .

merge m:1 id using  apartment_distances.dta, keep(master match) nogenerate

merge m:1 id using  rca_characteristics.dta, keep(match) nogenerate

* the tract file is keyed on GEOID, so borrow that name for the merge
rename building_tract GEOID
merge m:1 GEOID using all_tracts_2017, keepusing(pct_students) keep(match) nogenerate
rename (pct_students GEOID) (building_pct_students building_tract)

* --- naming
rename (hhnum pdirect street st pstdirect ///
	city zip lon lat) ///
	(hhnum_building pdirect_building street_building st_building pstdirect_building ///
	city_building zip_building longitude_building latitude_building)
rename km_to_apt km_to_building
rename b_nearby* buildings_within*


* an in-building flag only stands if it refers to the building linked here
* (people can live in earlier buildings that sit near 2014-2015 buildings)
replace currently_in_building=0 if currently_in_building==1 & building_id != original_building_id

order person_id round building_id km_to_building currently_in_building move_year hhnum pdirect street pstdirect city zip lat lon ///
	current* origin* dest* *_building buildings_within* building_units

* --- regression variables
drop building_tag
rename km_to_building dist
foreach v in inc college white {
	gen log_`v'=ln(origin_`v')
}
drop if origin_inc==.
keep if move_year>2010

* --- people inside the buildings
* buildings with no flagged resident: treat moves within .02 as in-building
bysort building_id: egen any_match=total(currently_in_building)
replace currently_in_building=1 if dist<.02 & any_match==0

* --- origin income buckets (one income cutoff per msa)
local msa_names atlanta austin chicago denver la brooklyn philadelphia portland sf seattle dc
local msa_cuts  66000 74000 69000 77000 70000 76000 69000 72000 96000 83000 83000
local n_msa : word count `msa_names'

gen median_income=0
forvalues k=1/`n_msa' {
	local city : word `k' of `msa_names'
	local cut  : word `k' of `msa_cuts'
	replace median_income = `cut' if msa=="`city'"
}

gen vlow_inc=(origin_inc<median_income*.66 & building_pct_student<.2)
gen high_inc=(origin_inc>median_income)


* --- keep only buildings whose tract income is below the msa cutoff
gen building_income_g=0
forvalues k=1/`n_msa' {
	local city : word `k' of `msa_names'
	local cut  : word `k' of `msa_cuts'
	replace building_income_g = 1 if msa=="`city'" & building_med_hh_inc<`cut'
}
keep if building_income_g==1

* --- treatment dummies: buildings built before 2018 form the treated sample
gen sample=(building_built<2018)
gen treated=sample
gen treated_after=treated*(move_year>building_built)
gen gap=move_year-building_built

forvalues k=1/5 {

	gen tx_`k'=(gap==`k' & sample==1)
	gen tx_n`k'=(gap==-`k' & sample==1)

}

* censor end points: gaps beyond +/-3 are folded into the +/-3 dummies
replace tx_n3 = 1 if gap<-3 & sample==1
replace tx_3 = 1 if gap>3 & sample==1

* --- time trend variable
egen msa_year=group(msa move_year)
gen zero=0

* --- one output file per radius (in meters; dist is compared in thousands)
foreach radius in 250 400 {

	* snapshot the full data before the 250 cut so the 400 cut starts fresh;
	* the 400 pass is last and is left in memory
	if `radius' == 250 {
		preserve
	}

	* cut to desired buildings and radius
	keep if dist<(`radius'/1000)

	gen building_sample=(inlist(building_built, 2014, 2015,2018) ///
		& building_units>=50  & building_income_g==1 ///
		& move_year>2010 & buildings_within_`radius'_built==0 ///
		& building_pct_student<.25 & currently_in_building==0)

	* yearly count of sample moves per building, 2011-2017
	forvalues yr=2011/2017 {
		bysort building_id: egen number_in_`yr'=total(move_year==`yr' & building_sample==1)
	}

	egen min_number=rowmin(number_in_*)
	keep if building_sample!=0

	egen building_tag=tag(building_id)
	tab msa building_built if building_tag==1 & min_number>0

	* require at least one sample move in every year
	drop if min_number<1

	* weight = 1 / (number of sample moves in the building-year cell)
	bysort building_id move_year: egen building_sample_weight_d=total(building_sample)
	gen building_sample_weight=1/building_sample_weight_d

	save  mig_analysis_file_201458_`radius'.dta,replace

	if `radius' == 250 {
		restore
	}

}


}

end

********************************************************************************
* convert long file to nearest ddd sample

program define ddd_file

{

* Purpose: build the triple-difference (ddd) analysis file.  Every move in
* mig_analysis_file is linked to its closest RCA building completed in 2014,
* 2015, or 2018; building and tract characteristics are merged on; and the
* ring, timing, sample, and weight variables are created.
* Reads : rca_cleaned, mig_analysis_file, apartment_distances.dta,
*         rca_characteristics.dta, all_tracts_2017
* Writes: rca_2014_2015_2018, raw_migration_long_201458,
*         mig_analysis_file_ddd.dta
* The geonear command must be installed.

* pull correct buildings
use rca_cleaned,clear
keep if yearblt_nb==2014 | yearblt_nb==2015 | yearblt_nb==2018 
rename latitude lat
rename longitude lon
save rca_2014_2015_2018,replace

* associate moves to nearest buildings in time frame
use mig_analysis_file,clear
keep building_id person_id round hhnum pdirect street st pstdirect city zip lat lon current_* origin* dest* move_year currently_in_building add_hhnum add_street
gen double row_id=_n
geonear row_id lat lon using   rca_2014_2015_2018.dta, n(base_id lat lon) near(1) genstub(base_id) wide

* blank out the link when the nearest building is more than 1 away
* (a missing distance also satisfies "> 1" in Stata)
tostring base_id, replace
replace base_id = "" if km_to_base_id > 1
replace km_to_base_id = . if km_to_base_id > 1
rename km_to_base_id km_to_apt
rename building_id original_building_id

save raw_migration_long_201458,replace

use raw_migration_long_201458, clear

* add building and tract char
rename base_id id
destring(id), replace
drop if id== .

merge m:1 id using  apartment_distances.dta

drop if _merge==2
drop _merge

merge m:1 id using  rca_characteristics.dta , 
keep if _merge==3
drop _merge

* the tract file is keyed on GEOID, so borrow that name for the merge
rename building_tract GEOID
merge m:1 GEOID using all_tracts_2017,keepus(pct_students)
keep if _merge==3
drop _merge
rename pct_students building_pct_students
rename GEOID building_tract

* rename 
rename (hhnum pdirect street st pstdirect city zip lon lat) ///
	(hhnum_building pdirect_building street_building st_building pstdirect_building city_building zip_building longitude_building latitude_building)
rename km_to_apt km_to_building
rename b_nearby* buildings_within*

* fix incorrect building assignments (can result from people living in earlier buildings that are near 2014-2015 buildings)
replace currently_in_building=0 ///
	if currently_in_building==1 & building_id != original_building_id

order person_id round building_id km_to_building currently_in_building move_year hhnum pdirect street pstdirect city zip lat lon ///
	current* origin* dest* *_building buildings_within* building_units 

* generate variables for regressions	
drop building_tag
rename km_to_building dist
gen log_inc=log(origin_inc)
gen log_college=log(origin_college)
gen log_white=log(origin_white)
keep if origin_inc!=.
keep if move_year>2010

* throw out people in the new buildings
* (buildings with no flagged resident: treat moves within .02 as in-building)
bys building_id: egen any_match=sum(currently_in_building)
replace currently_in_building=1 if ///
	dist<.02 & any_match==0
	
* add origin income buckets
gen median_income=0
replace median_income = 66000 if msa=="atlanta" 
replace median_income = 74000 if msa=="austin" 
replace median_income = 69000 if msa=="chicago" 
replace median_income = 77000 if msa=="denver" 
replace median_income = 70000 if msa=="la" 
replace median_income = 76000 if msa=="brooklyn" 
replace median_income = 69000 if msa=="philadelphia" 
replace median_income = 72000 if msa=="portland" 
replace median_income = 96000 if msa=="sf" 
replace median_income = 83000 if msa=="seattle" 
replace median_income = 83000 if msa=="dc"

gen vlow_inc=(origin_inc<median_income*.66 & building_pct_student<.2)
gen high_inc=(origin_inc>median_income)


* add gentrification definitions
gen building_income_g=0
replace building_income_g = 1 if msa=="atlanta" & building_med_hh_inc<66000
replace building_income_g = 1 if msa=="austin" & building_med_hh_inc<74000
replace building_income_g = 1 if msa=="chicago" & building_med_hh_inc<69000
replace building_income_g = 1 if msa=="denver" & building_med_hh_inc<77000
replace building_income_g = 1 if msa=="la" & building_med_hh_inc<70000
replace building_income_g = 1 if msa=="brooklyn" & building_med_hh_inc<76000
replace building_income_g = 1 if msa=="philadelphia" & building_med_hh_inc<69000
replace building_income_g = 1 if msa=="portland" & building_med_hh_inc<72000
replace building_income_g = 1 if msa=="sf" & building_med_hh_inc<96000
replace building_income_g = 1 if msa=="seattle" & building_med_hh_inc<83000
replace building_income_g = 1 if msa=="dc" & building_med_hh_inc<83000
keep if building_income_g==1 

* make some treatment dummies
* sample and gap are created once here and reused further down
gen sample=(building_built<2018)
gen gap=move_year-building_built

forvalues i=1(1)5{

	gen tx_`i'=0
	gen tx_n`i'=0
	
	replace tx_`i'=1 if gap==`i' & sample==1 
	replace tx_n`i'=1 if gap==-`i' & sample==1 

}

* censor end points: gaps beyond +/-3 are folded into the +/-3 dummies
replace tx_n3 = 1 if gap<-3 & sample==1
replace tx_3 = 1 if gap>3 & sample==1

* cut to desired buildings and radius
keep if dist<.6
gen l250=(dist<.25)
	
* sample and weights 
gen building_sample=(inlist(building_built, 2014, 2015,2018)>0 ///
	& building_units>=50 & building_income_g==1 ///
	& move_year>2010 & buildings_within_250_built==0 ///
	& building_pct_student<.2 & currently_in_building==0)
	
	
* weight = 1 / (number of sample moves in the building-year cell)
bys building_id move_year: egen building_sample_weight_d=sum(building_sample)
gen building_sample_weight=1/building_sample_weight_d	

	
* yearly count of sample moves per building inside the 250 ring
forvalues i=2011/2017{

	bys building_id: egen tx250_number_in_`i'=sum(move_year==`i' & building_sample==1 & l250==1)
	
}

egen tx250_min_number=rowmin(tx250_number_in_*)

* same count for moves with dist in [.25, .6]
forvalues i=2011/2017{

	bys building_id: egen tx256_number_in_`i'=sum(move_year==`i' & building_sample==1  & inrange(dist,.25,.6)>0)
	
}

egen tx256_min_number=rowmin(tx256_number_in_*)


* make your time trend variable
egen building_id_year=group(building_id move_year)
egen msa_year=group(msa move_year)
egen ring_fe=group(building_id l250)

* gap already exists (created with the treatment dummies above); generating it
* a second time would stop the program with "variable gap already defined".
* tx_1 also exists already, so it is overwritten with replace rather than gen.
* NOTE(review): this drops the sample==1 condition from tx_1 only, leaving
* tx_2-tx_5 sample-specific -- confirm which tx_1 definition the ddd
* regressions expect.
replace tx_1=(gap==1)

* treatment variable
* (sample already exists with this same definition, so it is not regenerated)
gen after=(move_year>building_built)
replace after=1 if move_year>2015 & building_built==2018
gen after_250_treated=(sample==1 & l250==1 & move_year>building_built)


save  mig_analysis_file_ddd.dta,replace
 
	
}

end

********************************************************************************
*  construct far-far file

program define far_far

{

* Purpose: build the far-far analysis file.  Every move in mig_analysis_file
* is linked to its closest RCA building completed in 2014, 2015, or 2018;
* building and tract characteristics are merged on; and only moves with
* .25 < dist < .8 from the linked building are kept.
* Reads : rca_cleaned, mig_analysis_file, apartment_distances.dta,
*         rca_characteristics.dta, all_tracts_2017
* Writes: rca_2014_2015_2018, raw_migration_long_201458,
*         mig_analysis_file_far_far.dta
* The geonear command must be installed.

* --- candidate buildings
use rca_cleaned,clear
keep if inlist(yearblt_nb, 2014, 2015, 2018)
rename (latitude longitude) (lat lon)
save rca_2014_2015_2018,replace

* --- nearest candidate building for each move
use mig_analysis_file,clear
keep building_id person_id round hhnum pdirect street st pstdirect city zip ///
	lat lon current_* origin* dest* move_year currently_in_building add_hhnum add_street
gen double row_id=_n
geonear row_id lat lon using   rca_2014_2015_2018.dta, ///
	n(base_id lat lon) near(1) genstub(base_id) wide

* blank out the link when the nearest building is more than 1 away
* (a missing distance also satisfies "> 1" in Stata)
tostring base_id, replace
replace base_id = "" if km_to_base_id > 1
replace km_to_base_id = . if km_to_base_id > 1
rename (km_to_base_id building_id) (km_to_apt original_building_id)

save raw_migration_long_201458,replace

use raw_migration_long_201458, clear

* --- building and tract characteristics
rename base_id id
destring id, replace
drop if id== .

merge m:1 id using  apartment_distances.dta, keep(master match) nogenerate

merge m:1 id using  rca_characteristics.dta, keep(match) nogenerate

* the tract file is keyed on GEOID, so borrow that name for the merge
rename building_tract GEOID
merge m:1 GEOID using all_tracts_2017, keepusing(pct_students) keep(match) nogenerate
rename (pct_students GEOID) (building_pct_students building_tract)

* --- naming
rename (hhnum pdirect street st pstdirect ///
	city zip lon lat) ///
	(hhnum_building pdirect_building street_building st_building pstdirect_building ///
	city_building zip_building longitude_building latitude_building)
rename km_to_apt km_to_building
rename b_nearby* buildings_within*


* an in-building flag only stands if it refers to the building linked here
* (people can live in earlier buildings that sit near 2014-2015 buildings)
replace currently_in_building=0 if currently_in_building==1 & building_id != original_building_id

order person_id round building_id km_to_building currently_in_building move_year hhnum pdirect street pstdirect city zip lat lon ///
	current* origin* dest* *_building buildings_within* building_units

* --- regression variables
drop building_tag
rename km_to_building dist
foreach v in inc college white {
	gen log_`v'=ln(origin_`v')
}
drop if origin_inc==.
keep if move_year>2010

* --- people inside the buildings
* buildings with no flagged resident: treat moves within .02 as in-building
bysort building_id: egen any_match=total(currently_in_building)
replace currently_in_building=1 if dist<.02 & any_match==0

* --- origin income buckets (one income cutoff per msa)
local msa_names atlanta austin chicago denver la brooklyn philadelphia portland sf seattle dc
local msa_cuts  66000 74000 69000 77000 70000 76000 69000 72000 96000 83000 83000
local n_msa : word count `msa_names'

gen median_income=0
forvalues k=1/`n_msa' {
	local city : word `k' of `msa_names'
	local cut  : word `k' of `msa_cuts'
	replace median_income = `cut' if msa=="`city'"
}

gen vlow_inc=(origin_inc<median_income*.66 & building_pct_student<.2)
gen high_inc=(origin_inc>median_income)


* --- keep only buildings whose tract income is below the msa cutoff
gen building_income_g=0
forvalues k=1/`n_msa' {
	local city : word `k' of `msa_names'
	local cut  : word `k' of `msa_cuts'
	replace building_income_g = 1 if msa=="`city'" & building_med_hh_inc<`cut'
}
keep if building_income_g==1

* --- treatment dummies: buildings built before 2018 form the treated sample
gen sample=(building_built<2018)
gen gap=move_year-building_built

forvalues k=1/5 {

	gen tx_`k'=(gap==`k' & sample==1)
	gen tx_n`k'=(gap==-`k' & sample==1)

}

* censor end points: gaps beyond +/-3 are folded into the +/-3 dummies
replace tx_n3 = 1 if gap<-3 & sample==1
replace tx_3 = 1 if gap>3 & sample==1

* --- time trend variable
egen msa_year=group(msa move_year)
gen zero=0

* --- cut to the far ring
keep if dist>.25 & dist<.800

gen building_sample=(inlist(building_built, 2014, 2015,2018) ///
	& building_units>=50  & building_income_g==1 ///
	& move_year>2010 & buildings_within_250_built==0 ///
	& building_pct_student<.25 & currently_in_building==0)

* yearly count of sample moves per building, 2011-2017
forvalues yr=2011/2017 {
	bysort building_id: egen number_in_`yr'=total(move_year==`yr' & building_sample==1)
}

egen min_number=rowmin(number_in_*)
keep if building_sample!=0

egen building_tag=tag(building_id)
tab msa building_built if building_tag==1 & min_number>0

* require at least one sample move in every year
drop if min_number<1

* weight = 1 / (number of sample moves in the building-year cell)
bysort building_id move_year: egen building_sample_weight_d=total(building_sample)
gen building_sample_weight=1/building_sample_weight_d

* --- treatment variable
gen treated=sample
gen treated_after=treated*(move_year>building_built)

save  mig_analysis_file_far_far.dta,replace

}

end

********************************************************************************
*  construct no pioneer near-near file

program define no_pioneer

{

* Purpose: build the "no pioneer" near-near analysis file.  Every move in
* mig_analysis_file is linked to its closest RCA building completed in 2014,
* 2015, or 2018; building and tract characteristics are merged on; and moves
* within 250 m (dist < .25) of the linked building are kept.
* Reads : rca_cleaned, mig_analysis_file, apartment_distances.dta,
*         rca_characteristics.dta, all_tracts_2017
* Writes: rca_2014_2015_2018, raw_migration_long_201458,
*         mig_analysis_file_no_pioneer.dta
* The geonear command must be installed.

* pull correct buildings
use rca_cleaned,clear
keep if yearblt_nb==2014 | yearblt_nb==2015 | yearblt_nb==2018 
rename latitude lat
rename longitude lon
save rca_2014_2015_2018,replace

* associate moves to nearest buildings in time frame
use mig_analysis_file,clear
keep building_id person_id round hhnum pdirect street st pstdirect city zip lat lon current_* origin* dest* move_year currently_in_building add_hhnum add_street
gen double row_id=_n
geonear row_id lat lon using   rca_2014_2015_2018.dta, n(base_id lat lon) near(1) genstub(base_id) wide

* blank out the link when the nearest building is more than 1 away
* (a missing distance also satisfies "> 1" in Stata)
tostring base_id, replace
replace base_id = "" if km_to_base_id > 1
replace km_to_base_id = . if km_to_base_id > 1
rename km_to_base_id km_to_apt
rename building_id original_building_id

save raw_migration_long_201458,replace

use raw_migration_long_201458, clear

* add building and tract char
rename base_id id
destring(id), replace
drop if id== .

merge m:1 id using  apartment_distances.dta

drop if _merge==2
drop _merge

merge m:1 id using  rca_characteristics.dta , 
keep if _merge==3
drop _merge

* the tract file is keyed on GEOID, so borrow that name for the merge
rename building_tract GEOID
merge m:1 GEOID using all_tracts_2017,keepus(pct_students)
keep if _merge==3
drop _merge
rename pct_students building_pct_students
rename GEOID building_tract

* rename 
rename (hhnum pdirect street st pstdirect city zip lon lat) ///
	(hhnum_building pdirect_building street_building st_building pstdirect_building city_building zip_building longitude_building latitude_building)
rename km_to_apt km_to_building
rename b_nearby* buildings_within*


* fix incorrect building assignments (can result from people living in earlier buildings that are near 2014-2015 buildings)
replace currently_in_building=0 ///
	if currently_in_building==1 & building_id != original_building_id

order person_id round building_id km_to_building currently_in_building move_year hhnum pdirect street pstdirect city zip lat lon ///
	current* origin* dest* *_building buildings_within* building_units 

* variables for regression
drop building_tag
rename km_to_building dist
gen log_inc=log(origin_inc)
gen log_college=log(origin_college)
gen log_white=log(origin_white)
keep if origin_inc!=.
keep if move_year>2010

* throw out maybe in the new building people
* (buildings with no flagged resident: treat moves within .02 as in-building)
bys building_id: egen any_match=sum(currently_in_building)
replace currently_in_building=1 if ///
	dist<.02 & any_match==0
	
* add origin income buckets
gen median_income=0
replace median_income = 66000 if msa=="atlanta" 
replace median_income = 74000 if msa=="austin" 
replace median_income = 69000 if msa=="chicago" 
replace median_income = 77000 if msa=="denver" 
replace median_income = 70000 if msa=="la" 
replace median_income = 76000 if msa=="brooklyn" 
replace median_income = 69000 if msa=="philadelphia" 
replace median_income = 72000 if msa=="portland" 
replace median_income = 96000 if msa=="sf" 
replace median_income = 83000 if msa=="seattle" 
replace median_income = 83000 if msa=="dc"

gen vlow_inc=(origin_inc<median_income*.66 & building_pct_student<.2)
gen high_inc=(origin_inc>median_income)


* g defs
gen building_income_g=0
replace building_income_g = 1 if msa=="atlanta" & building_med_hh_inc<66000
replace building_income_g = 1 if msa=="austin" & building_med_hh_inc<74000
replace building_income_g = 1 if msa=="chicago" & building_med_hh_inc<69000
replace building_income_g = 1 if msa=="denver" & building_med_hh_inc<77000
replace building_income_g = 1 if msa=="la" & building_med_hh_inc<70000
replace building_income_g = 1 if msa=="brooklyn" & building_med_hh_inc<76000
replace building_income_g = 1 if msa=="philadelphia" & building_med_hh_inc<69000
replace building_income_g = 1 if msa=="portland" & building_med_hh_inc<72000
replace building_income_g = 1 if msa=="sf" & building_med_hh_inc<96000
replace building_income_g = 1 if msa=="seattle" & building_med_hh_inc<83000
replace building_income_g = 1 if msa=="dc" & building_med_hh_inc<83000
keep if building_income_g==1 

* cut to desired buildings and radius
* local macros are private to each program, so the radius must be defined
* here; without it `dist' expands to nothing and the sample line below would
* reference a non-existent variable (buildings_within__built)
local dist 250
keep if dist<(`dist'/1000)

* the ">-1" test on buildings_within_`dist'_built also passes when the count
* is positive, so (unlike "==0") buildings with earlier nearby buildings stay in
* NOTE(review): presumably this is the "no pioneer" relaxation -- confirm
gen building_sample=(inlist(building_built, 2014, 2015,2018)>0 ///
	& building_units>=50  & building_income_g==1 ///
	& move_year>2010 & buildings_within_`dist'_built>-1 ///
	& building_pct_student<.25 & currently_in_building==0)
	
* yearly count of sample moves per building, 2011-2017
forvalues i=2011/2017{

	bys building_id: egen number_in_`i'=sum(move_year==`i' & building_sample==1)
	
}

egen min_number=rowmin(number_in_*)
drop if building_sample==0 

egen building_tag=tag(building_id)
tab msa building_built if building_tag==1 & min_number>0

* require at least one sample move in every year
drop if min_number<1
	
* weights for each sample
* weight = 1 / (number of sample moves in the building-year cell)
bys building_id move_year: egen building_sample_weight_d=sum(building_sample)
gen building_sample_weight=1/building_sample_weight_d	

* make some treatment dummies
gen sample=(building_built<2018)
gen gap=move_year-building_built

forvalues i=1(1)5{

	gen tx_`i'=0
	gen tx_n`i'=0
	
	replace tx_`i'=1 if gap==`i' & sample==1 
	replace tx_n`i'=1 if gap==-`i' & sample==1 

}

* censor end points: gaps beyond +/-3 are folded into the +/-3 dummies
replace tx_n3 = 1 if gap<-3 & sample==1
replace tx_3 = 1 if gap>3 & sample==1

* make your time trend variable
egen msa_year=group(msa move_year)
gen zero=0
gen treated=sample
gen treated_after=treated*(move_year>building_built)

save  mig_analysis_file_no_pioneer.dta,replace

	
}


end

********************************************************************************
* get all moves near any 2014,2015 buildings, instead of just associating
* each move with closest building.  useful for summary stats.

program define near_far_quantity_file

{

* Purpose: pair every move in mig_analysis_file with every 2014-2015 RCA
* building within 0.6 of it (one row per move-building pair rather than only
* the closest building), then attach building and tract characteristics.
* Reads : rca_cleaned, mig_analysis_file, apartment_distances.dta,
*         rca_characteristics.dta, all_tracts_2017
* Writes: rca_2014_2015, remerge_characteristics,
*         raw_migration_quantity_20145, mig_quantities_20145.dta
* The geonear command must be installed.

* --- candidate buildings: RCA properties built in 2014 or 2015
use rca_cleaned,clear
keep if inlist(yearblt_nb, 2014, 2015)
rename (latitude longitude) (lat lon)
save rca_2014_2015,replace

* --- moves, with a row id so their characteristics can be re-attached
use mig_analysis_file,clear
keep building_id person_id round hhnum pdirect street st pstdirect city zip ///
	lat lon current_* origin* dest* move_year currently_in_building
gen double row_id=_n
rename building_id original_building_id
save remerge_characteristics, replace

* --- all move-building pairs within 0.6, in long form
geonear row_id lat lon using   rca_2014_2015.dta, ///
	n(base_id lat lon) within(0.6) long
merge m:1 row_id using remerge_characteristics, keep(match) nogenerate

tostring base_id, replace
rename km_to_base_id km_to_apt
keep if km_to_apt<=.6

save raw_migration_quantity_20145,replace


use raw_migration_quantity_20145, clear

* --- building and tract characteristics
rename base_id id
destring id, replace
merge m:1 id using  apartment_distances.dta, keep(master match) nogenerate
drop if id== .

merge m:1 id using  rca_characteristics.dta, keep(match) nogenerate

* the tract file is keyed on GEOID, so borrow that name for the merge
rename building_tract GEOID
merge m:1 GEOID using all_tracts_2017, keepusing(pct_students) keep(match) nogenerate
rename (pct_students GEOID) (building_pct_students building_tract)

* --- naming
* NOTE(review): st is not given the _building suffix here, unlike the other
* programs in this file that perform this rename -- confirm this is intended
drop id
rename (hhnum pdirect street pstdirect ///
	city zip lon lat) ///
	(hhnum_building pdirect_building street_building pstdirect_building ///
	city_building zip_building longitude_building latitude_building)
rename km_to_apt km_to_building
rename b_nearby* buildings_within*

order person_id round building_id km_to_building currently_in_building move_year hhnum pdirect street pstdirect city zip lat lon ///
	current* origin* dest* *_building buildings_within* building_units

* an in-building flag only stands if it refers to the building paired here
replace currently_in_building=0 if currently_in_building==1 & building_id != original_building_id


save  mig_quantities_20145.dta,replace

}

end

********************************************************************************
*  collapse data to make summary statistics fast

program define collapse_to_summary

{

* Collapses mig_quantities_20145 to the building / move-year level and writes
*   in_mig_summary_file              in-migration counts, classified by origin tract
*   collapsed_migration_destination  out-migration counts, classified by destination tract
*   net_mig_summary_file             in minus out for the rich and very poor groups
*
* Groups, relative to the per-MSA income cutoff stored in median_income:
*   rich   tract income above the cutoff
*   poor   tract income below the cutoff and student share below .2
*   vpoor  tract income below .66 * cutoff and student share below .2

* per-MSA income cutoffs, kept once as parallel lists (k-th name <-> k-th value)
local msa_names atlanta austin chicago denver la brooklyn philadelphia portland sf seattle dc
local msa_cutoffs 66000 74000 69000 77000 70000 76000 69000 72000 96000 83000 83000
local n_msa : word count `msa_names'

* start with in-migration

{

use mig_quantities_20145,clear
keep if move_year>2010
drop if origin_inc==. | origin_white==. 
replace km_to_building = 0 if currently_in_building==1

* income definitions (0 for any msa not in the list)
gen median_income=0
forvalues k=1/`n_msa'{

	local m : word `k' of `msa_names'
	local c : word `k' of `msa_cutoffs'
	replace median_income = `c' if msa=="`m'" 

}

* denote migrants to building and different rings
gen building_migrants=currently_in_building

gen close_migrants=(km_to_building<.25)
replace close_migrants=0 if currently_in_building==1

gen c400_migrants=(km_to_building<.4)
replace c400_migrants=0 if currently_in_building==1

gen gross_migrants=close_migrants+building_migrants
gen g400_migrants=c400_migrants+building_migrants

gen migrants_600=(km_to_building<.6 & km_to_building>.25)

* one indicator per origin group, shared by the building, ring and 600m
* counts so that an observation sitting exactly on a cutoff is classified the
* same way in every count
gen byte is_rich=(origin_inc>median_income)
gen byte is_poor=(origin_inc<median_income & origin_student<.2)
gen byte is_vpoor=(origin_inc<median_income*.66 & origin_student<.2)

* make the same variables for movers from wealthy, poor and very poor tracts
foreach grp in rich poor vpoor{

	gen `grp'_building_migrants=currently_in_building*is_`grp'
	gen `grp'_close_migrants=close_migrants*is_`grp'
	gen `grp'_c400_migrants=c400_migrants*is_`grp'

	gen gross_migrants_`grp'=`grp'_close_migrants+`grp'_building_migrants
	gen g400_migrants_`grp'=`grp'_c400_migrants+`grp'_building_migrants

	gen `grp'_migrants_600=migrants_600*is_`grp'

}

* collapse to building/year level
collapse (first) msa building_med_hh_inc building_pct_white building_pct_college building_built ///
	buildings_with*_built building_units building_pct_student ///
	(sum) *c400* *g400* *600 *close_migrants *building_migrants gross* ///
	, by(building_id move_year)

* rename things so every count reads <group>_<measure>, with group "total"
* for the unclassified counts
rename (gross_migrants close_migrants building_migrants c400_migrants g400_migrants) ///
	(gross_migrants_total total_close_migrants total_building_migrants total_c400_migrants total_g400_migrants)
rename *_total total_*
rename gross_migrants_* *_gross_migrants
rename g400_migrants_* *_g400_migrants
rename migrants_600 total_migrants_600

* do some cleaning/add some variables needed to match analysis
gen years_to_building=move_year - building_built
gen income_g=0
forvalues k=1/`n_msa'{

	local m : word `k' of `msa_names'
	local c : word `k' of `msa_cutoffs'
	replace income_g = 1 if msa=="`m'" & building_med_hh_inc<`c'

}

keep if buildings_within_250_built==0
keep if inrange(move_year, 2011,2017)
keep if income_g==1
keep if building_units>=50  
keep if building_pct_student<.25 

* check statistics for just surviving sample: drop buildings with any year of
* zero close migrants, then keep buildings that ever had a building migrant
gen zero=(total_close_migrants==0)
bys building_id: egen ever_zero=sum(zero)
drop if ever_zero>0
bys building_id: egen any_in_building=sum(total_building_migrants)
replace any_in_building=1 if any_in_building>1
tab move_year any_in_building,row
keep if any_in_building==1
	
* remove people who moved into building address before it was constructed
local type_list total rich poor vpoor
foreach type in `type_list'{

	* more than a year before completion: count building movers as close movers
	replace `type'_close_migrants=`type'_close_migrants + `type'_building_migrants if years_to_building<-1
	replace `type'_c400_migrants=`type'_c400_migrants + `type'_building_migrants if years_to_building<-1

	* exactly one year before completion: move building movers into the
	* completion year
	* NOTE(review): these movers are added to `type'_building_migrants in year 0
	* but not to `type'_gross_migrants / `type'_g400_migrants in year 0 -
	* confirm that is intended
	gen misclassified_`type'=(years_to_building==-1)*`type'_building_migrants
	bys building_id: egen total_misclassified_`type'=sum(misclassified_`type')
	replace `type'_building_migrants=`type'_building_migrants+total_misclassified_`type' ///
		if years_to_building==0

	* take the reclassified movers out of the gross counts in year -1
	replace `type'_gross_migrants = `type'_gross_migrants - `type'_building_migrants ///
		if years_to_building==-1
		
	replace `type'_g400_migrants = `type'_g400_migrants - `type'_building_migrants ///
		if years_to_building==-1
	
	* finally reset building migrants to 0 in the pre-period
	replace `type'_building_migrants=0 if years_to_building<0

	drop *misclassified*	

}	

* year adjustment: rescale each year so that mean migration into the
* 250-600m ring equals its 2013 level

bys move_year: egen mean_migrants=mean(total_migrants_600)
egen mean_migrants_2013=max(mean_migrants*(move_year==2013))
gen year_adjuster=mean_migrants_2013/mean_migrants

* normalize by year, for all types of migrants
local type_list total rich  poor vpoor 
local vars gross_migrants g400_migrants close_migrants c400_migrants building_migrants migrants_600

foreach type in `type_list'{

	foreach var in `vars'{

		gen an_`type'_`var'=`type'_`var'*year_adjuster

	}

}

save in_mig_summary_file,replace

}

*  do all the same things for out-migration
{

use mig_quantities_20145,clear
keep if move_year>2010
drop if dest_inc==. | dest_white==. 
replace km_to_building = 0 if currently_in_building==1

* income definitions (0 for any msa not in the list)
gen median_income=0
forvalues k=1/`n_msa'{

	local m : word `k' of `msa_names'
	local c : word `k' of `msa_cutoffs'
	replace median_income = `c' if msa=="`m'" 

}

* close migrants, migrants to building, migrants within 600
gen building_migrants=currently_in_building

gen close_migrants=(km_to_building<.25)
replace close_migrants=0 if currently_in_building==1

gen c400_migrants=(km_to_building<.4)
replace c400_migrants=0 if currently_in_building==1

gen gross_migrants=close_migrants+building_migrants
gen g400_migrants=c400_migrants+building_migrants

gen migrants_600=(km_to_building<.6 & km_to_building>.25)

* one indicator per destination group, shared by the building, ring and 600m
* counts so that an observation sitting exactly on a cutoff is classified the
* same way in every count
gen byte is_rich=(dest_inc>median_income)
gen byte is_poor=(dest_inc<median_income & dest_student<.2)
gen byte is_vpoor=(dest_inc<median_income*.66 & dest_student<.2)

* make the same variables for movers to wealthy, poor and very poor tracts
foreach grp in rich poor vpoor{

	gen `grp'_building_migrants=currently_in_building*is_`grp'
	gen `grp'_close_migrants=close_migrants*is_`grp'
	gen `grp'_c400_migrants=c400_migrants*is_`grp'

	gen gross_migrants_`grp'=`grp'_close_migrants+`grp'_building_migrants
	gen g400_migrants_`grp'=`grp'_c400_migrants+`grp'_building_migrants

	gen `grp'_migrants_600=migrants_600*is_`grp'

}

* collapse to building/year level
collapse (first) msa building_med_hh_inc building_pct_white building_pct_college building_built ///
	buildings_with*_built building_units building_pct_student ///
	(sum)  *c400* *g400* *600 *close_migrants *building_migrants gross* ///
	, by(building_id move_year)

* rename things so every count reads <group>_<measure>
rename (gross_migrants close_migrants building_migrants c400_migrants g400_migrants) ///
	(gross_migrants_total total_close_migrants total_building_migrants total_c400_migrants total_g400_migrants)
rename *_total total_*
rename gross_migrants_* *_gross_migrants
rename g400_migrants_* *_g400_migrants
rename migrants_600 total_migrants_600

save collapsed_migration_destination, replace

}


* combine to make a net migration file 
{

* start from out-migration; the d_ prefix marks destination-based counts so
* they do not collide with the in-migration counts merged in below
use collapsed_migration_destination, clear
rename *vpoor* *d_vpoor*
rename *rich* *d_rich*

gen years_to_building=move_year - building_built
gen income_g=0
forvalues k=1/`n_msa'{

	local m : word `k' of `msa_names'
	local c : word `k' of `msa_cutoffs'
	replace income_g = 1 if msa=="`m'" & building_med_hh_inc<`c'

}

keep if buildings_within_250_built==0
keep if inrange(move_year, 2011,2017)
keep if income_g==1
keep if building_units>=50  
keep if building_pct_student<.25 

* merge in in-migration so that you can compute net; only building-years
* present in both files survive
merge 1:1 building_id move_year using in_mig_summary_file, keepus(vpoor* rich* an_vpoor* an_rich*)
keep if _merge==3
drop _merge

* hack out people who moved into building address before it was constructed
local type_list d_vpoor d_rich
foreach type in `type_list'{

	* more than a year before completion: count building movers as close movers
	replace `type'_close_migrants=`type'_close_migrants + `type'_building_migrants if years_to_building<-1

	* exactly one year before completion: move building movers into the
	* completion year
	* NOTE(review): these movers are added to `type'_building_migrants in year 0
	* but not to `type'_gross_migrants in year 0 - confirm that is intended
	gen misclassified_`type'=(years_to_building==-1)*`type'_building_migrants
	bys building_id: egen total_misclassified_`type'=sum(misclassified_`type')
	replace `type'_building_migrants=`type'_building_migrants+total_misclassified_`type' ///
		if years_to_building==0

	* take the reclassified movers out of the gross count in year -1
	replace `type'_gross_migrants = `type'_gross_migrants - `type'_building_migrants ///
		if years_to_building==-1
	
	* finally reset building migrants to 0 in the pre-period
	replace `type'_building_migrants=0 if years_to_building<0

	drop *misclassified*	

}	

* we can do some normalizing as well
* year adjustment based on out-migration in the 250-600m ring
* (total_migrants_600 here is the destination-file count)

bys move_year: egen mean_migrants=mean(total_migrants_600)
egen mean_migrants_2013=max(mean_migrants*(move_year==2013))
gen year_adjuster=mean_migrants_2013/mean_migrants

* normalize by year (note this is doing out with a different base than in)
local vars gross_migrants close_migrants building_migrants migrants_600

foreach type in `type_list'{

	foreach var in `vars'{

		gen an_`type'_`var'=`type'_`var'*year_adjuster

	}

}

* net migration (in minus out), raw and year-normalized, first for very poor
* areas and then for rich areas
foreach grp in vpoor rich{

	gen `grp'_gross_migrants_net=`grp'_gross_migrants-d_`grp'_gross_migrants
	gen `grp'_close_migrants_net=`grp'_close_migrants-d_`grp'_close_migrants
	gen `grp'_building_migrants_net=`grp'_building_migrants-d_`grp'_building_migrants

	gen an_`grp'_gross_migrants_net=an_`grp'_gross_migrants-an_d_`grp'_gross_migrants
	gen an_`grp'_close_migrants_net=an_`grp'_close_migrants-an_d_`grp'_close_migrants
	gen an_`grp'_building_migrants_net=an_`grp'_building_migrants-an_d_`grp'_building_migrants

}

save net_mig_summary_file, replace

}

}

end




